In [1]:
import pickle
import numpy as np
import pandas as pd
import os, glob
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## The following are the ML models which can be used for trasinning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import MinMaxScaler,StandardScaler

import timeit
import warnings
warnings.filterwarnings("ignore")
E:\Anaconda3\envs\tf\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index
In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl

import numpy as np
%matplotlib inline
import pandas as pd
In [3]:
import seaborn as sns
sns.set(style="darkgrid")
In [4]:
sns.set_context('talk')
In [5]:
from IPython.display import display, HTML, Image
In [6]:
import matplotlib.colors as mcolors
In [ ]:
 
In [7]:
dataFolder = os.getcwd()
In [8]:
group1_low =1.634
group2_low =0.673
group2_upper =1.634
group3_upper =0.673
In [9]:
## read the test file
data =pd.read_csv(os.path.join(dataFolder,'10_PC_02_LHS_5000_54854_01_s1_G.csv'))
data.columns =[col.strip() for col in data.columns]
data['ratio'] = data['b(CaO)']/data['b(SiO2)']
group1 = data[data['ratio']>=group1_low]  # with Portlandite, no Amor-S1
group2 = data[(data['ratio']<=group2_upper) & (data['ratio']>=group2_low)]  # no Portlandite, no Amor-S1
group3 = data[data['ratio']<=group3_upper]     # no Portlandite, with Amor-S1
groups = [group1,group2,group3]
In [11]:
## read trained Model info:
modelInfo =pd.read_csv(os.path.join(dataFolder,'ModelTrainSummary_all.csv'))
modelInfo = modelInfo.iloc[:,1:4]
modelInfo
Out[11]:
group var modelType
0 group1 Vol(aq) rfModel
1 group1 Vol(aq) lgbModel
2 group1 Vol(aq) gbModel
3 group1 Vol(aq) GPyModel
4 group1 pH CONST
... ... ... ...
73 group3 nCa(CSHQ) linear
74 group3 nSi(CSHQ) linear
75 group3 nH2O(CSHQ) linear
76 group3 C/S(CSHQ) CONST
77 group3 nGelPW(CSH) linear

78 rows × 3 columns

In [12]:
modelTypeLst = modelInfo['modelType'].unique()
MLModels =[ml for ml in modelTypeLst if 'Model' in ml]
nonMLModels = [ml for ml in modelTypeLst if 'Model' not in ml]
In [ ]:
 
In [13]:
## load the mdoels
dataCols = data.columns.to_list()
modelsLst = []
for irow,row in modelInfo.iterrows():
    group = row['group']
    var =row['var']
    modelType = row['modelType']
    icol =dataCols.index(var)
    if '/' in var:
        fileCol = var.replace("/",'')
    else:
        fileCol = var            

    modelFolder = os.path.join(dataFolder,'SavedModel',group)
    if modelType=='CONST':
        ext = '.csv'
        fileName = os.path.join(modelFolder,modelType + '_'+str(icol)+'_'+fileCol+ext)
        tempModel =pd.read_csv(fileName)
    else:
        ext ='.sav'
        fileName = os.path.join(modelFolder,modelType + '_'+str(icol)+'_'+fileCol+ext)
        tempModel = pickle.load(open(fileName, 'rb'))
    
    modelsLst.append(tempModel)

modelInfo['modelObj'] = modelsLst   
In [ ]:
 
In [14]:
finalDF = pd.DataFrame()
for ML in MLModels:
    modelProcessing =[]
    modelProcessing = nonMLModels.copy()
    modelProcessing.append(ML)
    tempModelInfo = modelInfo[modelInfo['modelType'].isin(modelProcessing)]
    for col in group1.columns[4:-1]:  # exclude 'ratio' 
        colModelInfo = tempModelInfo[tempModelInfo['var'] == col]  
        
        for i, group in enumerate(groups):
        
            dataX = group.iloc[:,1:4]    
            dataY = group[col].values
            groupModelInfo = colModelInfo[colModelInfo['group'] =='group'+str(i+1)]
            modelType = groupModelInfo['modelType'].values[0]
            model = groupModelInfo['modelObj'].values[0]
            if modelType=='CONST':
                y_pred = list(model['const'].values)*len(dataY)
                
            elif modelType=='linear':
                y_pred = model.predict(dataX.values)
            
            else:
                scaler = StandardScaler().fit(dataX.values)
                X_scaled = scaler.transform(dataX.values)  # This will be used for input of trainning dataset
                y_pred  = model.predict(X_scaled)  
    
            tempDF = pd.DataFrame({'testDataY':dataY, 'predDataY':y_pred})
            tempDF['modelType'] = [modelType]*len(tempDF)
            tempDF['var'] = [col]*len(tempDF)
            tempDF['group'] = ['group'+str(i+1)]*len(tempDF)
            tempDF['modelSimulation'] = [ML+'_simulation']*len(tempDF)
                
            if len(finalDF)==0:   
                finalDF = tempDF
            else:
                finalDF = pd.concat([finalDF,tempDF],axis=0)
    
In [15]:
finalDF.to_csv('Comparison_predict_5KDataset.csv',index=False)

Visualization of the prediction of the test dataset¶

In [16]:
def drawPlots(trgVar,testDataY,predDataY,simulation):
    
    fig, ax =plt.subplots(figsize=(8,8))
    
    RMSE = mean_squared_error(testDataY,predDataY)
    R2   = r2_score(testDataY,predDataY)
    axMax = max(max(testDataY),max(predDataY))
    axMin = min(min(testDataY),min(predDataY))
    
    ## Fitting the line           
    x2= np.linspace(axMin,axMax,30);
    y2 = x2
    ax.plot(testDataY,predDataY,'bo',markerfacecolor = 'blue',markeredgecolor ='darkblue',markeredgewidth=0.5,label=trgVar)
    
    #ax.plot(df3['b(SiO2)'].values,df3['b(CaO)'].values,'bo',markerfacecolor = 'c',markeredgecolor ='b',markeredgewidth=0.5,label='no Amor-S1')
    #ax.plot(group3['b(SiO2)'].values,group3['b(CaO)'].values,'g^',markerfacecolor = 'm',markeredgecolor ='m',markeredgewidth=1,label='group3')
    ax.plot(x2,y2,'r-',lw=3,label ='1:1 ratio')
    ax.legend (loc='best',ncol=5)           
    ax.set_xlim(axMin,axMax)
    ax.set_ylim(axMin,axMax)
    #ax.text(0.05, 0.8, 'Above the line with Portlandite',fontsize = 20,color = 'k')
    #ax.text(0.4, 0.50, 'Below the line no Portlandite',fontsize = 20,color = 'k')
    ax.set_title(simulation+ ' ====> '+trgVar)
    ax.set_xlabel('testDataY')
    ax.set_ylabel('PredDataY')    
        
    return R2, RMSE    
In [ ]:
 
In [25]:
modelSimulations = finalDF['modelSimulation'].unique()
trgVars = finalDF['var'].unique()
statMeasure = []
for simulation in modelSimulations:
    if 'MLP' in simulation or 'xgb' in simulation: # skip the xgb and MLP models
        continue
    tempDF = finalDF[finalDF['modelSimulation']==simulation]
    for trgVar in trgVars:
        df = tempDF[tempDF['var']==trgVar]  # no Portlandite
        testDataY = df['testDataY'].values
        predDataY = df['predDataY'].values
        R2,RMSE = drawPlots(trgVar,testDataY,predDataY,simulation)
        statMeasure.append([trgVar,R2,RMSE,simulation])
    
statMeasDF = pd.DataFrame(statMeasure,columns=['Var','R2','RMSE','modelSimulation'])    
In [26]:
statMeasDF
Out[26]:
Var R2 RMSE modelSimulation
0 Vol(aq) 0.991504 8.080633e-06 rfModel_simulation
1 pH 0.990699 7.314908e-03 rfModel_simulation
2 nCa(aq) 0.969484 1.660522e-08 rfModel_simulation
3 nCa(s) 1.000000 1.601451e-08 rfModel_simulation
4 nSi(aq) 0.978627 3.207269e-10 rfModel_simulation
5 nSi(s_reac) 1.000000 2.703401e-10 rfModel_simulation
6 nPortlandite 0.995254 7.027868e-04 rfModel_simulation
7 nAmor-Sl 0.961508 2.075204e-04 rfModel_simulation
8 mCSHQ 0.998835 1.042550e-06 rfModel_simulation
9 nCa(CSHQ) 1.000000 1.716375e-08 rfModel_simulation
10 nSi(CSHQ) 1.000000 2.703401e-10 rfModel_simulation
11 nH2O(CSHQ) 0.998887 2.102353e-04 rfModel_simulation
12 C/S(CSHQ) 0.987923 1.549080e-03 rfModel_simulation
13 nGelPW(CSH) 0.999050 3.530165e-05 rfModel_simulation
14 Vol(aq) 0.991977 7.631142e-06 lgbModel_simulation
15 pH 0.978397 1.698957e-02 lgbModel_simulation
16 nCa(aq) 0.962786 2.025020e-08 lgbModel_simulation
17 nCa(s) 1.000000 1.601451e-08 lgbModel_simulation
18 nSi(aq) 0.982556 2.617732e-10 lgbModel_simulation
19 nSi(s_reac) 1.000000 2.703401e-10 lgbModel_simulation
20 nPortlandite 0.994461 8.201820e-04 lgbModel_simulation
21 nAmor-Sl 0.883784 6.265463e-04 lgbModel_simulation
22 mCSHQ 0.995146 4.342887e-06 lgbModel_simulation
23 nCa(CSHQ) 1.000000 1.716375e-08 lgbModel_simulation
24 nSi(CSHQ) 1.000000 2.703401e-10 lgbModel_simulation
25 nH2O(CSHQ) 0.994282 1.080325e-03 lgbModel_simulation
26 C/S(CSHQ) 0.970519 3.781303e-03 lgbModel_simulation
27 nGelPW(CSH) 0.999050 3.530165e-05 lgbModel_simulation
28 Vol(aq) 0.990172 9.348100e-06 gbModel_simulation
29 pH 0.977410 1.776561e-02 gbModel_simulation
30 nCa(aq) 0.955208 2.437375e-08 gbModel_simulation
31 nCa(s) 1.000000 1.601451e-08 gbModel_simulation
32 nSi(aq) 0.984079 2.389144e-10 gbModel_simulation
33 nSi(s_reac) 1.000000 2.703401e-10 gbModel_simulation
34 nPortlandite 0.989438 1.563931e-03 gbModel_simulation
35 nAmor-Sl 0.816584 9.888377e-04 gbModel_simulation
36 mCSHQ 0.992656 6.570474e-06 gbModel_simulation
37 nCa(CSHQ) 1.000000 1.716375e-08 gbModel_simulation
38 nSi(CSHQ) 1.000000 2.703401e-10 gbModel_simulation
39 nH2O(CSHQ) 0.992220 1.469860e-03 gbModel_simulation
40 C/S(CSHQ) 0.965003 4.488907e-03 gbModel_simulation
41 nGelPW(CSH) 0.999050 3.530165e-05 gbModel_simulation
42 Vol(aq) 0.999616 3.655889e-07 GPyModel_simulation
43 pH 0.996629 2.650814e-03 GPyModel_simulation
44 nCa(aq) 0.998023 1.075643e-09 GPyModel_simulation
45 nCa(s) 1.000000 1.601451e-08 GPyModel_simulation
46 nSi(aq) 0.978039 3.295589e-10 GPyModel_simulation
47 nSi(s_reac) 1.000000 2.703401e-10 GPyModel_simulation
48 nPortlandite 0.999563 6.472451e-05 GPyModel_simulation
49 nAmor-Sl 0.983547 8.870394e-05 GPyModel_simulation
50 mCSHQ 0.999850 1.341174e-07 GPyModel_simulation
51 nCa(CSHQ) 1.000000 1.716375e-08 GPyModel_simulation
52 nSi(CSHQ) 1.000000 2.703401e-10 GPyModel_simulation
53 nH2O(CSHQ) 0.999803 3.731310e-05 GPyModel_simulation
54 C/S(CSHQ) 0.999396 7.748488e-05 GPyModel_simulation
55 nGelPW(CSH) 0.999050 3.530165e-05 GPyModel_simulation
In [ ]:
 
In [27]:
statMeasDF_pivot=statMeasDF.pivot(index='Var',values=['R2','RMSE'],columns='modelSimulation')
statMeasDF.to_csv('statMeasure_trainDataset_noPivot.csv',index = False)

statMeasDF_pivot.to_csv('statMeasure_trainDataset_Pivot.csv')
In [20]:
## alternative way to draw figures
In [ ]:
 
In [21]:
def drawCombiningPlots(plotDataDF,trgVar):
    
    fig, ax =plt.subplots(figsize=(10,10))
    
    marker = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']
    colors = ['tab:blue','tab:orange','tab:gray','tab:olive','tab:cyan','tab:green']
    ## Fitting the line           
    axisMin=1000
    axisMax =-1000
    for i, col in enumerate(plotDataDF.columns[1:]):
        testDataY = plotDataDF.iloc[:,0].values
        predDataY = plotDataDF[col].values
        axMax = max(max(testDataY),max(predDataY))
        axMin = min(min(testDataY),min(predDataY))
        axisMin = min(axisMin,axMin)
        axisMax = max(axisMax,axMax)
        ax.scatter(testDataY,predDataY,marker =marker[i],edgecolor =colors[i],label=col.replace('_simulation',''))

    x2= np.linspace(axisMin,axisMax,30);
    y2 = x2
    ax.plot(x2,y2,'r-',lw=3,label ='1:1 ratio')
    ax.legend (loc='best',ncol=3)           
    ax.set_xlim(axisMin,axisMax)
    ax.set_ylim(axisMin,axisMax)
    ax.set_title(trgVar)
    ax.set_xlabel('testDataY')
    ax.set_ylabel('PredDataY')    
        
In [22]:
modelSimulations = finalDF['modelSimulation'].unique()
trgVars = finalDF['var'].unique()
for trgVar in trgVars:
    tempdf = finalDF[finalDF['var']==trgVar]  # no Portlandite
    dataplot = {}
    plotDataDF = pd.DataFrame()
    for simulation in modelSimulations:
        # skip the xgb and MLP models
        # the two models are very bad, need to be fine tune
        if 'MLP' in simulation or 'xgb' in simulation: 
            continue

        df = tempdf[tempdf['modelSimulation']==simulation]
        if len(plotDataDF)==0:
            plotDataDF=df[['testDataY','predDataY']]
        else:    
            plotDataDF = pd.concat([plotDataDF,df[['predDataY']]],axis=1)

        plotDataDF.rename({'predDataY': simulation}, axis=1,inplace=True)
                                    
    drawCombiningPlots(plotDataDF,trgVar)    
 
In [ ]: